import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model
large_train = "./424_F2024_Final_PC_large_train_v1.csv"
test_path = "./424_F2024_Final_PC_test_without_response_v1.csv"
model_path = "my_model_500epoch.h5"
df_train = pd.read_csv(large_train)
df_test = pd.read_csv(test_path)
df_train.sample(5)
|  | rating | year_review | firm | job_title | headline | pros | cons |
|---|---|---|---|---|---|---|---|
| 283563 | 3 | 2018 | Oracle | Large company | large company with impressive market share | large company more focused on margin than profit | |
| 408546 | 5 | 2016 | PwC | Senior Associate | They have great benefits and really help grow ... | Great career growth, travel opportunities, and... | Big company and a lot of red tape to get throu... |
| 369588 | 4 | 2021 | Goldman-Sachs | Strategist | Good company to start your career | Good Perks, Culcure for junior to grow, Good M... | Benefit is not comparable to other banks |
| 253300 | 4 | 2021 | McDonald-s | Mcdonalds Fry Cook | It was ok for a first job | decent hours and good management | low opportunity to move up |
| 132776 | 2 | 2020 | Deloitte | Senior Consultant | Not a good company | Nothing much to say about this company | Not a good place to work |
df_test.sample(5)
|  | rating | year_review | firm | job_title | headline | pros | cons |
|---|---|---|---|---|---|---|---|
| 24179 | NaN | 2017 | J-P-Morgan | Software Engineer | Unjust Dismissal | None that I can think of in the department I w... | I was dismissed on totally fabricated charges.... |
| 25271 | NaN | 2017 | Citi | Anonymous Employee | Former Corporate Banking Executive at Citigroup | A never say die attitude which helps push the ... | Large organization and has associated red tape... |
| 76017 | NaN | 2017 | IBM | Business Analyst | Good work culture | - Good environment to work in\r- Lots of flexi... | - Management not good.\r- Not hikes , appraisa... |
| 54714 | NaN | 2021 | EY | Advisory Senior Consultant | Great for Professional Advancement | Extensive resources and knowledge at your disp... | Very little consistency with upper management.... |
| 41838 | NaN | 2017 | British-Airways | PMO Manager | Actively seeking employment elsewhere | People that are enthusiastic about the commerc... | Run by accountants that know better than the p... |
nltk.download('stopwords')
nltk.download('wordnet')
# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def clean_text(text):
if pd.isnull(text):
return 'Unknown'
text = text.encode('ascii', 'ignore').decode()
text = re.sub(r'\s+', ' ', text)
text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
text = text.lower().strip()
words = text.split()
words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
return ' '.join(words)
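As a quick check of the cleaning step, here is what clean_text produces for a made-up review sentence (the example string is hypothetical, and the exact output depends on the installed NLTK corpora):
# Hypothetical example of clean_text on a short string
sample_review = "The benefits were great, but the hours were long!!"
print(clean_text(sample_review))
# Expected output (roughly): benefit great hour long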
# Preprocess the training and test data
def preprocess_data(df_train, df_test):
# Clean text columns
for col in ['pros', 'cons', 'headline']:
df_train[col] = df_train[col].apply(clean_text)
df_test[col] = df_test[col].apply(clean_text)
# Combine text columns for vectorization
combined_text_train = df_train['pros'] + " " + df_train['cons'] + " " + df_train['headline']
combined_text_test = df_test['pros'] + " " + df_test['cons'] + " " + df_test['headline']
# Initialize the TF-IDF vectorizer with an increased max_features
tfidf_vectorizer = TfidfVectorizer(max_features=1000) # Adjusted to capture more information
# Fit and transform the training data, transform the test data
combined_tfidf_train = tfidf_vectorizer.fit_transform(combined_text_train).toarray()
combined_tfidf_test = tfidf_vectorizer.transform(combined_text_test).toarray()
# Handle missing or empty values in 'job_title' and 'firm'
df_train['job_title'] = df_train['job_title'].fillna('Unknown')
df_test['job_title'] = df_test['job_title'].fillna('Unknown')
df_train['firm'] = df_train['firm'].fillna('Unknown')
df_test['firm'] = df_test['firm'].fillna('Unknown')
# Encode 'firm' using LabelEncoder
label_encoder_firm = LabelEncoder()
all_firm_values = pd.concat([df_train['firm'], df_test['firm']])
label_encoder_firm.fit(all_firm_values)
df_train['firm_encoded'] = label_encoder_firm.transform(df_train['firm'])
df_test['firm_encoded'] = label_encoder_firm.transform(df_test['firm'])
    # Convert 'rating' to a numeric target for regression.
    # Note: the test file ships without the response, so df_test['rating'] is
    # all NaN and y_test is returned only as a placeholder; it is never used
    # to evaluate the model.
    y_train = df_train['rating'].values
    y_test = df_test['rating'].values
# Combine all the features for both training and test sets
X_train = np.hstack([
combined_tfidf_train,
df_train[['firm_encoded']].values
]).astype('float32')
X_test = np.hstack([
combined_tfidf_test,
df_test[['firm_encoded']].values
]).astype('float32')
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Return processed features and labels
return X_train, X_test, y_train, y_test
# Preprocess the data
X_train, X_test, y_train, y_test = preprocess_data(df_train, df_test)
[nltk_data] Downloading package stopwords to /Users/mayank/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mayank/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
X_train.shape, X_test.shape
((500000, 1001), (100000, 1001))
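The processed matrices are fairly large (500,000 x 1,001 for training), so an optional caching step can avoid re-running the TF-IDF and scaling on every session. This is only a convenience sketch; the file name is arbitrary:
# Optional: cache the processed arrays so preprocessing doesn't have to be rerun
np.savez_compressed("processed_features.npz",
                    X_train=X_train, X_test=X_test, y_train=y_train)
# Reload later with:
# data = np.load("processed_features.npz")
# X_train, X_test, y_train = data["X_train"], data["X_test"], data["y_train"]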
X_train[:5]
array([[-0.08589483, -0.05596118, -0.05692343, ..., -0.13509741,
-0.05917969, -1.2568026 ],
[-0.08589483, -0.05596118, -0.05692343, ..., -0.13509741,
14.924771 , -0.19707541],
[-0.08589483, -0.05596118, -0.05692343, ..., -0.13509741,
-0.05917969, -1.2666148 ],
[-0.08589483, -0.05596118, -0.05692343, ..., -0.13509741,
-0.05917969, -1.7964784 ],
[-0.08589483, -0.05596118, -0.05692343, ..., -0.13509741,
-0.05917969, -0.4325703 ]], dtype=float32)
# Build the model with adjusted architecture
model = Sequential([
Dense(256, input_dim=X_train.shape[1], kernel_regularizer=l2(0.001)), # Added L2 regularization
BatchNormalization(),
LeakyReLU(alpha=0.1), # LeakyReLU for better gradient flow
Dropout(0.25), # Fine-tuned dropout rate
Dense(128, kernel_regularizer=l2(0.001)),
BatchNormalization(),
LeakyReLU(alpha=0.1),
Dropout(0.2),
Dense(64, kernel_regularizer=l2(0.001)),
BatchNormalization(),
LeakyReLU(alpha=0.1),
Dropout(0.15),
Dense(32, kernel_regularizer=l2(0.001)),
BatchNormalization(),
LeakyReLU(alpha=0.1),
Dense(1, activation='linear') # Linear activation for regression
])
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
loss='mean_squared_error',
metrics=['mse'])
# Callbacks
early_stopping = EarlyStopping(monitor='val_mse', patience=30, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_mse', factor=0.5, patience=15, min_lr=0.00001)
# Fit the model
history = model.fit(
X_train, y_train,
validation_split=0.2,
epochs=500, # Increased epochs for better learning
batch_size=64, # Option to try other sizes
callbacks=[early_stopping, reduce_lr],
verbose=2
)
# Save the trained model
model.save(model_path) # Save entire model to HDF5 format
WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.
WARNING:absl:There is a known slowdown when using v2.11+ Keras optimizers on M1/M2 Macs. Falling back to the legacy Keras optimizer, i.e., `tf.keras.optimizers.legacy.Adam`.
Epoch 1/500
2024-12-09 08:56:55.639082: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
6250/6250 - 13s - loss: 1.4434 - mse: 1.0142 - val_loss: 0.9616 - val_mse: 0.7804 - lr: 0.0010 - 13s/epoch - 2ms/step
Epoch 2/500
6250/6250 - 11s - loss: 0.9546 - mse: 0.8024 - val_loss: 0.8940 - val_mse: 0.7603 - lr: 0.0010 - 11s/epoch - 2ms/step
...
Epoch 34/500
6250/6250 - 10s - loss: 0.8036 - mse: 0.7554 - val_loss: 0.7724 - val_mse: 0.7301 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
...
Epoch 70/500
6250/6250 - 10s - loss: 0.7719 - mse: 0.7358 - val_loss: 0.7513 - val_mse: 0.7179 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
...
Epoch 140/500
6250/6250 - 10s - loss: 0.6995 - mse: 0.6864 - val_loss: 0.7273 - val_mse: 0.7143 - lr: 3.1250e-05 - 10s/epoch - 2ms/step
(per-epoch output abridged; ReduceLROnPlateau halved the learning rate at epochs 34, 70, 91, 108 and 126, and the run stopped after epoch 140)
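The fit call above returns a history object; plotting its loss curves makes the effect of the learning-rate drops visible. This is a small optional sketch, not a figure produced by the run above:
# Optional: visualize training vs. validation loss from the fit history
plt.figure(figsize=(8, 5))
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel('Epoch')
plt.ylabel('MSE loss')
plt.legend()
plt.show()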
model = load_model(model_path)
y_pred = model.predict(X_train)
print("Predictions on the test set:")
print(y_pred)
# %%
# Calculate Mean Squared Error (MSE)
mse_train = mean_squared_error(y_train, y_pred)
# Calculate R-squared
r2_train = r2_score(y_train, y_pred)
# Print the results
print(f"Training MSE: {mse_train}")
print(f"Training R-squared: {r2_train}")
# %%
y_pred_test = model.predict(X_test)
print(y_pred_test)
15625/15625 [==============================] - 7s 432us/step
Predictions on the training set:
[[3.1388288]
 [2.7456193]
 [4.656323 ]
 ...
 [3.80933  ]
 [2.9484968]
 [2.980651 ]]
Training MSE: 0.6588238189223193
Training R-squared: 0.4645265585273568
3125/3125 [==============================] - 1s 433us/step
[[4.6070895]
 [4.155876 ]
 [4.70691  ]
 ...
 [3.6154668]
 [3.5599217]
 [4.668838 ]]
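Because the target is a 1-5 star rating, the training MSE above is easier to interpret as a root-mean-squared error in rating units (simple arithmetic on the value already computed):
# Express the training error in star units: RMSE = sqrt(MSE)
rmse_train = np.sqrt(mse_train)
print(f"Training RMSE: {rmse_train:.3f}")   # ~0.81 stars of typical error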
# Step 1: Define your information
student_id = '20952031'
anonymized_name = '808'
prediction_accuracy = r2_train
algorithm_name = 'Neural Network Model'
data = [
[student_id],
[anonymized_name],
[prediction_accuracy],
[algorithm_name]
] + [pred for pred in y_pred_test]
df = pd.DataFrame(data)
df.to_csv('final_perdictions_learn1.csv', header=False, index=False)
print("CSV file created successfully.")
CSV file created successfully.
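A quick optional check that the submission file has the expected layout (four metadata rows followed by one prediction per test review); the file name is as written above:
# Hedged sanity check on the saved submission file
submission = pd.read_csv('final_perdictions_learn1.csv', header=None)
print(submission.shape)   # expected: (4 + len(y_pred_test), 1)
print(submission.head(6))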
print(y_pred_test)
[[4.6070895]
 [4.155876 ]
 [4.70691  ]
 ...
 [3.6154668]
 [3.5599217]
 [4.668838 ]]
# Create the Kaggle-style DataFrame
y_data_kaggle = []
for pred in y_pred_test:
for val in pred:
y_data_kaggle.append(val)
data = {
"ID_num": range(1, len(y_data_kaggle) + 1),
"prediction": y_data_kaggle
}
df = pd.DataFrame(data)
# Save to CSV
df.to_csv('kaggle_predictions1.csv', index=False)
print("CSV file created successfully in Kaggle format.")
CSV file created successfully in Kaggle format.
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 256) 256512
batch_normalization (BatchN (None, 256) 1024
ormalization)
leaky_re_lu (LeakyReLU) (None, 256) 0
dropout (Dropout) (None, 256) 0
dense_1 (Dense) (None, 128) 32896
batch_normalization_1 (Batc (None, 128) 512
hNormalization)
leaky_re_lu_1 (LeakyReLU) (None, 128) 0
dropout_1 (Dropout) (None, 128) 0
dense_2 (Dense) (None, 64) 8256
batch_normalization_2 (Batc (None, 64) 256
hNormalization)
leaky_re_lu_2 (LeakyReLU) (None, 64) 0
dropout_2 (Dropout) (None, 64) 0
dense_3 (Dense) (None, 32) 2080
batch_normalization_3 (Batc (None, 32) 128
hNormalization)
leaky_re_lu_3 (LeakyReLU) (None, 32) 0
dense_4 (Dense) (None, 1) 33
=================================================================
Total params: 301,697
Trainable params: 300,737
Non-trainable params: 960
_________________________________________________________________
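The parameter counts in the summary can be verified by hand; for example, the first Dense layer maps the 1,001 input features to 256 units, and each BatchNormalization layer carries 4 parameters per unit (gamma, beta, moving mean, moving variance):
# First Dense layer: weights plus biases
print(1001 * 256 + 256)   # 256512, matching the summary
# First BatchNormalization layer: 4 parameters per unit
print(4 * 256)            # 1024, of which 2 * 256 are non-trainable moving statistics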
# Visualize the model architecture (plot_model was imported above; it also
# requires the pydot package and the Graphviz system binaries)
plot_model(model, to_file='model_structure.png', show_shapes=True, show_layer_names=True)
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.
# Display the model structure image (only meaningful once plot_model has
# successfully written model_structure.png)
plt.figure(figsize=(10, 35))
img = plt.imread('model_structure.png')
plt.imshow(img)
plt.axis('off')
plt.show()
from wordcloud import WordCloud
for star_rating in range(1, 6):
# Combine all 'pros' text for the current star rating
review_text = ' '.join(df_train[df_train['rating'] == star_rating]['pros'].tolist())
# Create a word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(review_text)
# Display the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title(f'Word Cloud for {star_rating}-Star Reviews')
plt.axis('off') # Hide axes for better visualization
plt.show()
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
# Encode categorical features
label_encoder = LabelEncoder()
df_train['firm_encoded'] = label_encoder.fit_transform(df_train['firm'].fillna(''))
df_train['job_title_encoded'] = label_encoder.fit_transform(df_train['job_title'].fillna(''))
# Define features and target variable
feature_columns = ['year_review', 'firm_encoded', 'job_title_encoded']
target_column = 'rating' # Predicting the 'rating'
# Prepare data for training and testing
X = df_train[feature_columns]
y = df_train[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initialize and train the Random Forest model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=42)
random_forest.fit(X_train, y_train)
# Extract feature importance scores
feature_importances = random_forest.feature_importances_
# Visualize feature importance
plt.figure(figsize=(8, 6))
plt.barh(feature_columns, feature_importances, color='skyblue')
plt.title("Feature Importance for Rating Prediction")
plt.xlabel("Importance Score")
plt.ylabel("Feature Names")
plt.tight_layout() # Adjust layout to fit title and labels
plt.show()
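To see the exact numbers behind the bar chart, the importances can also be printed as a sorted table (a small convenience addition, not part of the original output):
# Pair each feature with its importance score and sort in descending order
importance_table = pd.Series(feature_importances, index=feature_columns).sort_values(ascending=False)
print(importance_table)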
correlation_matrix = X.corr()
# Plot the heatmap
plt.figure(figsize=(10, 8)) # Adjust figure size as needed
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix for Features')
plt.show()
# Visualize the distribution of each feature in the training and testing datasets
for feature_name in feature_columns:
plt.figure(figsize=(8, 6))
# Plot the distribution of the feature in the training data
plt.hist(
X_train[feature_name],
bins=30,
alpha=0.7,
color='blue',
label='Training Data',
density=True
)
# Plot the distribution of the feature in the test data
plt.hist(
X_test[feature_name],
bins=30,
alpha=0.7,
color='red',
label='Testing Data',
density=True
)
# Add title and labels
plt.title(f"Distribution of '{feature_name}' in Training vs Testing Data")
plt.xlabel(f"{feature_name}")
plt.ylabel("Density")
plt.legend() # Show the legend to distinguish between Train and Test data
plt.tight_layout() # Ensure the layout fits nicely
plt.show()
# Predict ratings for both training and test sets
train_predictions = random_forest.predict(X_train)
test_predictions = random_forest.predict(X_test)
# Plot the distribution of actual vs. predicted ratings for training and test datasets
plt.figure(figsize=(12, 6))
# Plot for the training data
plt.subplot(1, 2, 1)
plt.hist(y_train, bins=30, alpha=0.7, color='blue', label='Actual (Training)', density=True)
plt.hist(train_predictions, bins=30, alpha=0.7, color='red', label='Predicted (Training)', density=True)
plt.title("Actual vs. Predicted Ratings: Training Data")
plt.xlabel("Rating")
plt.ylabel("Density")
plt.legend()
# Plot for the test data
plt.subplot(1, 2, 2)
plt.hist(y_test, bins=30, alpha=0.7, color='blue', label='Actual (Testing)', density=True)
plt.hist(test_predictions, bins=30, alpha=0.7, color='red', label='Predicted (Testing)', density=True)
plt.title("Actual vs. Predicted Ratings: Testing Data")
plt.xlabel("Rating")
plt.ylabel("Density")
plt.legend()
# Adjust layout for better visualization
plt.tight_layout()
plt.show()
# Calculate residuals (errors) for training and testing datasets
train_residuals = y_train - train_predictions
test_residuals = y_test - test_predictions
# Visualize the distribution of residuals
plt.figure(figsize=(12, 6))
# Residual plot for training data
plt.subplot(1, 2, 1)
plt.scatter(y_train, train_residuals, alpha=0.5, color='blue')
plt.axhline(0, color='black', linestyle='--', linewidth=1) # Reference line at 0
plt.title("Actual Ratings vs Residuals: Training Data")
plt.xlabel("Actual Rating")
plt.ylabel("Residual (Error)")
# Residual plot for testing data
plt.subplot(1, 2, 2)
plt.scatter(y_test, test_residuals, alpha=0.5, color='red')
plt.axhline(0, color='black', linestyle='--', linewidth=1) # Reference line at 0
plt.title("Actual Ratings vs Residuals: Testing Data")
plt.xlabel("Actual Rating")
plt.ylabel("Residual (Error)")
# Adjust layout and show the plots
plt.tight_layout()
plt.show()
def get_top_phrases_by_rating(df, rating_threshold, low=False):
    # Select reviews at or below the threshold for low ratings,
    # at or above it for high ratings
    if low:
        subset = df[df['rating'] <= rating_threshold]
    else:
        subset = df[df['rating'] >= rating_threshold]
    combined_text = ' '.join(subset['pros'] + ' ' + subset['cons'] + ' ' + subset['headline'])
    word_freq = pd.Series(combined_text.split()).value_counts().head(10)
    return word_freq
high_rating_phrases = get_top_phrases_by_rating(df_train, 4)
low_rating_phrases = get_top_phrases_by_rating(df_train, 2, low=True)
print("Top phrases in high-rated firms:")
print(high_rating_phrases)
print("\nTop phrases in low-rated firms:")
print(low_rating_phrases)
Top phrases in high-rated firms:
work           248368
good           202102
great          197599
company        124557
people          84598
place           76033
opportunity     65366
working         59299
hour            55052
lot             54680
Name: count, dtype: int64

Top phrases in low-rated firms:
work           358793
good           309259
great          250627
company        173852
people         135448
place           98262
opportunity     91781
management      88760
hour            88193
working         87336
Name: count, dtype: int64
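The raw counts above are dominated by words such as "work" and "good" that are frequent at every rating level, so they do not separate the groups well. A rough, optional sketch for surfacing more distinctive terms is to compare each word's relative frequency (its share of all words) between low- and high-rated reviews:
# Sketch: words relatively more common in low-rated than in high-rated reviews
def word_shares(df, mask):
    # Relative frequency of each word within the selected subset of reviews
    text = ' '.join(df.loc[mask, 'pros'] + ' ' + df.loc[mask, 'cons'] + ' ' + df.loc[mask, 'headline'])
    counts = pd.Series(text.split()).value_counts()
    return counts / counts.sum()

high_share = word_shares(df_train, df_train['rating'] >= 4)
low_share = word_shares(df_train, df_train['rating'] <= 2)
print((low_share / high_share).dropna().sort_values(ascending=False).head(10))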
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
# Reload the raw (uncleaned) review text for this analysis
df = pd.read_csv(large_train)
# Categorize reviews by their rating (Low: 1-2, Medium: 3-4, High: 5)
df['rating_group'] = pd.cut(df['rating'], bins=[0, 2, 4, 5], labels=['Low', 'Medium', 'High'])
# Function to extract top words/themes
def extract_themes(text_column, top_n=10):
vectorizer = CountVectorizer(stop_words='english')
word_counts = vectorizer.fit_transform(text_column.dropna())
word_sum = word_counts.sum(axis=0)
words_freq = [(word, word_sum[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)[:top_n]
return words_freq
# Analyze themes for pros and cons by rating group
themes = {}
for group in df['rating_group'].unique():
group_data = df[df['rating_group'] == group]
pros_themes = extract_themes(group_data['pros'], top_n=10)
cons_themes = extract_themes(group_data['cons'], top_n=10)
themes[group] = {'pros': pros_themes, 'cons': cons_themes}
# Display results in a table
for group, data in themes.items():
print(f"\n=== {group} Rated Firms ===")
print("Top Pros:", data['pros'])
print("Top Cons:", data['cons'])
# Optional: Create word clouds for visualization
def plot_wordcloud(text, title):
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(text))
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title(title, fontsize=16)
plt.show()
for group in df['rating_group'].unique():
group_data = df[df['rating_group'] == group]
plot_wordcloud(group_data['pros'].dropna(), f"{group} Rated Firms - Pros")
plot_wordcloud(group_data['cons'].dropna(), f"{group} Rated Firms - Cons")
=== Medium Rated Firms ===
Top Pros: [('good', 135574), ('work', 116435), ('great', 86490), ('people', 55525), ('benefits', 39715), ('company', 35581), ('environment', 31546), ('opportunities', 29698), ('working', 28386), ('life', 27891)]
Top Cons: [('work', 73507), ('management', 37321), ('hours', 36969), ('long', 27701), ('pay', 26397), ('company', 24463), ('life', 22969), ('people', 21708), ('time', 20808), ('balance', 20277)]
=== Low Rated Firms ===
Top Pros: [('good', 24406), ('work', 17873), ('great', 11024), ('people', 10598), ('benefits', 7689), ('company', 6677), ('nice', 5289), ('pay', 5244), ('working', 5111), ('job', 4347)]
Top Cons: [('work', 27668), ('management', 26298), ('people', 15191), ('company', 12934), ('pay', 10983), ('staff', 10689), ('managers', 10510), ('employees', 10105), ('time', 9444), ('hours', 9041)]
=== High Rated Firms ===
Top Pros: [('work', 62408), ('great', 58361), ('good', 48450), ('people', 28143), ('company', 24747), ('culture', 19808), ('benefits', 18672), ('environment', 18437), ('opportunities', 15378), ('working', 14786)]
Top Cons: [('work', 26527), ('hours', 15501), ('long', 12257), ('cons', 12144), ('company', 11643), ('working', 9138), ('time', 8740), ('life', 7328), ('think', 7211), ('good', 6785)]